The MS COCO 2014 dataset has roughly 40,000 validation images and 80,000 training images. Text annotations are only available for images in the training set.
Note that the .zip file is relatively large (about 14 GB), so the following cell will take a long time to execute.
In [ ]:
%%bash
# Download and extract the MS COCO 2014 training images (~14 GB archive)
# into ../data, then delete the archive to reclaim disk space.
# mkdir -p makes the cell safe to re-run: it succeeds even when ../data
# already exists (plain mkdir would fail and curl would then download
# into the wrong directory).
mkdir -p ../data
cd ../data
curl -O http://msvocds.blob.core.windows.net/coco2014/train2014.zip
unzip train2014.zip
rm train2014.zip
The COCO-Text dataset has 170,000 text instances; about half of the MS COCO images contain text. Annotations are provided for both the training set and the validation set.
In [ ]:
%%bash
# Download and extract the COCO-Text annotations (COCO_Text.json) into
# ../data, then delete the archive to reclaim disk space.
# Assumes ../data was created by the previous download cell.
cd ../data
curl -O https://s3.amazonaws.com/cocotext/COCO_Text.zip
unzip COCO_Text.zip
rm COCO_Text.zip
In [ ]:
from __future__ import absolute_import, division, print_function
# Maximum number of training images saved per class (text / no-text)
N_TRAIN = 30000
# Maximum number of validation images saved per class
N_VALID = 15000
# Minimum overlap area (square pixels) between a text annotation and a
# segment for has_text(...) to return True; the overlap must be
# strictly greater than this value
OVERLAP_THRESHOLD = 500
# Side length of the square segments, in pixels
SEGMENT_SIZE = 128
In [ ]:
def filename_to_id(filename):
    """Derive the numeric COCO image ID from a filename.

    For a name like 'COCO_train2014_000000123456.jpg' the third
    underscore-separated field holds the zero-padded ID; the round trip
    through int() strips the leading zeros.

    Returns the ID as a string, e.g. '123456'.
    """
    padded_id = filename.split('_')[2].split('.')[0]
    return str(int(padded_id))
In [ ]:
def calculate_overlap(rect1, rect2):
    """Return the intersection area of two axis-aligned rectangles.

    Each rectangle is a 4-tuple:
        (top-left X, top-left Y, bottom-right X, bottom-right Y).

    Returns 0 when the rectangles are disjoint or merely touch.
    """
    left = max(rect1[0], rect2[0])
    right = min(rect1[2], rect2[2])
    top = max(rect1[1], rect2[1])
    bottom = min(rect1[3], rect2[3])
    if right <= left or bottom <= top:
        return 0
    return (right - left) * (bottom - top)
In [ ]:
def has_text(image_id, segment_rect, annotation_data, overlap_threshold=None):
    """Return True if the segment overlaps a legible English text annotation.

    Args:
        image_id: COCO image ID as a string (see filename_to_id).
        segment_rect: 4-tuple rect of the segment, in the same coordinate
            convention as the annotation rects built below.
        annotation_data: parsed contents of COCO_Text.json, with the
            'imgToAnns' and 'anns' lookup tables.
        overlap_threshold: minimum overlap area in square pixels; the
            overlap must be strictly greater. Defaults to the module-level
            OVERLAP_THRESHOLD (the `global` declaration in the original was
            unnecessary for a read-only access).

    Returns:
        True as soon as one qualifying annotation overlaps enough,
        otherwise False.
    """
    if overlap_threshold is None:
        overlap_threshold = OVERLAP_THRESHOLD
    # Images with no annotations are simply absent from 'imgToAnns'.
    annotation_ids = annotation_data['imgToAnns'].get(image_id, [])
    for annotation_id in annotation_ids:
        # 'anns' is keyed by the string form of the annotation ID.
        annotation = annotation_data['anns'][str(annotation_id)]
        if annotation['legibility'] != 'legible':
            continue
        if annotation['language'] != 'english':
            continue
        bbox = annotation['bbox']
        # COCO bounding boxes are [x, y, width, height]; indices 1/0 and
        # 3/2 are deliberately swapped so the annotation rect uses the
        # same (row, col) ordering as the segment rects from get_segments.
        top, left = bbox[1], bbox[0]
        annotation_rect = (top, left, top + bbox[3], left + bbox[2])
        if calculate_overlap(annotation_rect, segment_rect) > overlap_threshold:
            return True
    return False
In [ ]:
from PIL import Image
from math import ceil
import numpy as np
def get_segments(filename):
    """Load an image from ../data/train2014 and cut it into square
    grayscale segments of SEGMENT_SIZE x SEGMENT_SIZE pixels.

    The image is padded on the right and bottom with uniformly
    distributed random noise so that both dimensions become multiples
    of SEGMENT_SIZE.

    Returns:
        (segments, segment_rects) where each segment is a 2-D numpy
        array and each rect is (top, left, bottom, right) in pixels —
        the (row, col) ordering that has_text expects.
    """
    image = Image.open('../data/train2014/' + filename)
    width, height = image.size
    padded_width = int(ceil(width / SEGMENT_SIZE) * SEGMENT_SIZE)
    padded_height = int(ceil(height / SEGMENT_SIZE) * SEGMENT_SIZE)
    # Random-noise canvas; the real image is copied into its top-left.
    padded_image = np.random.uniform(0, 256, (padded_height, padded_width))
    try:
        # PIL yields RGB channels in R, G, B order. (The original
        # unpacked them as b, g, r, applying the luma weights to the
        # wrong channels.)
        r, g, b = image.split()
        gray_image = (0.21 * np.asarray(r, dtype=np.float64)
                      + 0.72 * np.asarray(g, dtype=np.float64)
                      + 0.07 * np.asarray(b, dtype=np.float64))
        padded_image[:height, :width] = gray_image
    except ValueError:
        # Not a 3-channel image; assume single-channel and copy directly.
        padded_image[:height, :width] = image
    segments = []
    segment_rects = []
    # Rows run over the height, columns over the width. (The original
    # ran the row index over padded_width and the column index over
    # padded_height, which produced empty or truncated segments and
    # skipped part of every non-square image.)
    for top in range(0, padded_height, SEGMENT_SIZE):
        for left in range(0, padded_width, SEGMENT_SIZE):
            segments.append(
                padded_image[top:top + SEGMENT_SIZE, left:left + SEGMENT_SIZE])
            segment_rects.append(
                (top, left, top + SEGMENT_SIZE, left + SEGMENT_SIZE))
    return segments, segment_rects
In [ ]:
# scipy.misc.imsave was removed in SciPy 1.2; save via Pillow instead
# (Image and np are already imported by earlier cells).
def save_image(path, image):
    """Try to save a 2-D array as an image file at the given path.

    Pixel values are clipped to [0, 255] and cast to uint8. (Note:
    scipy.misc.imsave rescaled min..max to 0..255 instead; here the
    segments are already on the 0-255 scale, so clipping preserves the
    actual intensities.)

    Returns:
        The number of images successfully saved — 1 on success, 0 when
        the array is empty or Pillow cannot write it.
    """
    try:
        array = np.clip(np.asarray(image), 0, 255).astype(np.uint8)
        if array.size == 0:
            # Empty segment (e.g. from a degenerate crop) — nothing to save.
            return 0
        Image.fromarray(array).save(path)
    except (ValueError, OSError):
        return 0
    return 1
In [ ]:
%%bash
# Create the output directory layout for the two-class dataset:
#   train/text, train/no-text, valid/text, valid/no-text
# mkdir -p creates missing parents and succeeds if the directories
# already exist, so the cell is safe to re-run.
mkdir -p ../data/train/text ../data/train/no-text
mkdir -p ../data/valid/text ../data/valid/no-text
In [ ]:
from os import walk
import json
# List the image filenames in the training directory (walk yields
# (dirpath, dirnames, filenames); index 2 is the filenames).
images = next(walk('../data/train2014'))[2]
print('The dataset has ' + str(len(images)) + ' images.')
# Use a context manager so the JSON file handle is closed promptly
# (the original json.load(open(...)) leaked the handle).
with open('../data/COCO_Text.json') as annotation_file:
    annotation_data = json.load(annotation_file)
print('Text annotations loaded.')
In [ ]:
import random
# Shuffle in place so the train/validation split below is not biased by
# the arbitrary order of the directory listing.
random.shuffle(images)
In [ ]:
# Build the training set: cut each image into segments and save them
# into train/text or train/no-text until both classes hold N_TRAIN
# images. i_last_training_image records where the validation pass
# should resume.
n_text = 0
n_no_text = 0
i_last_training_image = 0
for filename in images:
    if n_text == N_TRAIN and n_no_text == N_TRAIN:
        break
    i_last_training_image += 1
    image_id = filename_to_id(filename)
    segments, segment_rects = get_segments(filename)
    for segment, rect in zip(segments, segment_rects):
        if has_text(image_id, rect, annotation_data):
            # Once the text quota is full, text segments are dropped.
            # (Previously they fell through to the elif and were saved
            # into train/no-text, polluting the negative class.)
            if n_text < N_TRAIN:
                n_text += save_image('../data/train/text/' + str(n_text) + '.jpg', segment)
        elif n_no_text < N_TRAIN:
            n_no_text += save_image('../data/train/no-text/' + str(n_no_text) + '.jpg', segment)
print('Successfully processed: ' + str(n_text + n_no_text) + ' / ' + str(N_TRAIN * 2))
In [ ]:
# Build the validation set, continuing from the first image that was
# not consumed by the training pass, until both classes hold N_VALID
# images.
n_text = 0
n_no_text = 0
for filename in images[i_last_training_image:]:
    if n_text == N_VALID and n_no_text == N_VALID:
        break
    image_id = filename_to_id(filename)
    segments, segment_rects = get_segments(filename)
    for segment, rect in zip(segments, segment_rects):
        if has_text(image_id, rect, annotation_data):
            # Once the text quota is full, text segments are dropped.
            # (Previously they fell through to the elif and were saved
            # into valid/no-text, polluting the negative class.)
            if n_text < N_VALID:
                n_text += save_image('../data/valid/text/' + str(n_text) + '.jpg', segment)
        elif n_no_text < N_VALID:
            n_no_text += save_image('../data/valid/no-text/' + str(n_no_text) + '.jpg', segment)
print('Successfully processed: ' + str(n_text + n_no_text) + ' / ' + str(N_VALID * 2))